import os
import time

import torch

# pip install modelscope transformers peft diffusers
# set the ModelScope cache directory before importing modelscope so the
# download location is picked up reliably
os.environ['MODELSCOPE_CACHE'] = 'D:/hwm_4032442470/Models'
from modelscope import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-8B"
# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
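# torch_dtype="auto" uses the dtype recorded in the checkpoint config;
# device_map="auto" spreads the layers over the available devices and
# requires the `accelerate` package to be installed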
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
print(model.device)  # confirm which device the weights ended up on
def qwen3_8B(prompt):
    # prepare the model input as a single-turn chat,
    # e.g. prompt = "Give me a short introduction to large language models."
    messages = [
        {"role": "user", "content": prompt}
    ]
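    # render the chat into Qwen's chat-template string
    # (special tokens included, not yet tokenized)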
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # conduct text completion
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32768
    )
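    # strip the prompt tokens; keep only the newly generated ids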
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # parse the thinking content (everything before the </think> token)
    try:
        # rindex finding 151668 (</think>)
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0
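    # with enable_thinking=False the template closes <think></think> inside the
    # prompt itself, so 151668 is usually absent and index stays 0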

    thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

    # print("thinking content:", thinking_content)
    # print("content:", content)
    return content
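
# A minimal streaming variant (a sketch; `qwen3_8B_stream` is a hypothetical
# helper, not part of the original script): TextStreamer from transformers
# prints tokens to stdout as they are generated instead of returning a string.
def qwen3_8B_stream(prompt):
    from transformers import TextStreamer
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generate(**model_inputs, max_new_tokens=32768, streamer=streamer)
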
if __name__ == "__main__":
    start_time = time.time()
    # prompt translated from Chinese: 男生和女生谁更适合当老师？
    r = qwen3_8B("Who are better suited to be teachers, men or women?")
    print(r)
    end_time = time.time()
    print(end_time - start_time)  # wall-clock generation time in seconds